How popular is your name?¶

UK ONS 1996-2021¶

Image

Sarah, Jason, Leon & Conrad¶

IMPORTS¶

In [ ]:
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.models import NumeralTickFormatter
from bokeh.io import output_notebook
output_notebook()

import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'

import seaborn as sns
Loading BokehJS ...

Create lists for later data wrangling¶

In [ ]:
#Setup names for our family.
females = ["Sarah"]
males = ["Jason","Leon","Conrad"]
family = females + males
In [ ]:
#Create some lists to select or reformat data later
count_cols = []     # "1996 Count", ... "2021 Count" ** Used to drop/rename columns
rank_cols = []      # "1996 Rank", ... "2021 Rank" ** Used to drop/rename columns
years = []          # "1996", ... "2021" ** Used to wrangle data and rename columns
for f in range(1996,2022):
    count_cols.append(str(f) + ' Count')
    rank_cols.append(str(f) + ' Rank')
    years.append(str(f))
In [ ]:
#Import ONS data
df1 = pd.read_excel("babynames1996to2021.xlsx","1",skiprows =7)     # Boys names in workbook 1
df2 = pd.read_excel( 'babynames1996to2021.xlsx',"2",skiprows =7)    # Girls names in workbook 2
df1 = df1.set_index("Name")
df2 = df2.set_index("Name") 
In [ ]:
#Replace NaN with 0
df1 = df1.replace("[x]",0)
df2 = df2.replace("[x]",0)

Filter our names to new dataframe¶

In [ ]:
#Filter names from ONS data
Girls = df2.loc[females]
Boys = df1.loc[males]
df_family = pd.concat([Girls, Boys], axis=0)
In [ ]:
#Drop rank and count columns
df_family_counts = df_family.drop(columns=rank_cols)
df_family_ranks = df_family.drop(columns=count_cols)
In [ ]:
#Rename columns "1996 Count" -> "1996" and "1996 Rank" -> "1996"
df_family_counts.columns = df_family_counts.columns.str.replace(" Count","")
df_family_ranks.columns = df_family_ranks.columns.str.replace(" Rank","")
In [ ]:
#Pivot DFs so rows are date observations
df_family_counts = pd.pivot_table(df_family_counts, values = years, columns=["Name"])
df_family_counts.index.name = "Year"
df_family_counts = df_family_counts.reindex(columns=family)

df_family_ranks = pd.pivot_table(df_family_ranks, values = years, columns=["Name"])
df_family_ranks.index.name = "Year"
df_family_ranks = df_family_ranks.reindex(columns=family)

Tables¶

UK ranking for selected names¶

In [ ]:
#Display the rank table for years since children born
df_family_ranks.tail(14)
Out[ ]:
Name Sarah Jason Leon Conrad
Year
2008 72 139 62 489
2009 79 145 60 473
2010 83 161 60 536
2011 90 159 62 576
2012 95 165 72 734
2013 96 155 75 611
2014 95 162 85 714
2015 96 163 85 779
2016 96 170 92 870
2017 103 179 98 861
2018 103 174 97 851
2019 107 175 93 1152
2020 115 213 93 921
2021 125 210 101 1448

UK count for selected names¶

In [ ]:
df_family_counts.tail(14)
Out[ ]:
Name Sarah Jason Leon Conrad
Year
2008 930 404 1138 71
2009 793 406 1155 76
2010 722 344 1204 64
2011 663 360 1069 61
2012 592 359 918 45
2013 574 372 837 56
2014 601 353 765 45
2015 581 355 795 41
2016 572 340 737 36
2017 530 314 669 36
2018 503 311 655 36
2019 479 302 662 23
2020 422 240 620 31
2021 403 251 590 17
In [ ]:
#Create totals DF
df1_totals = df1.drop(columns=rank_cols).sum()
df2_totals = df2.drop(columns=rank_cols).sum()
df1_totals = df1_totals.reset_index()
df2_totals = df2_totals.reset_index()
df_totals = pd.concat([df1_totals, df2_totals], axis=0)
df_totals = df_totals.replace(count_cols,years)
#df_totals = df1_totals + df2_totals
df_totals.columns=["Years","Total"]
df_totals = df_totals.groupby("Years").sum()
In [ ]:
#Calculate accumulative percentage change
df_family_diff = pd.DataFrame(df_family_counts,copy=True)
temp_col = [] # Build a list of temp columns to drop later
for name in family:
    df_family_diff[name + " Diff"] = df_family_diff[name].diff()
    temp_col.append(name + " Diff")
df_family_diff = df_family_diff.fillna(0)

for name in family:
    df_family_diff[name + " %"] = df_family_diff[name + " Diff"] / df_family_diff[name]
    df_family_diff[name + " %"] = df_family_diff[name + " %"].cumsum()

# Drop the columns not required for plotting
temp_col += family
df_family_diff = df_family_diff.drop(columns=temp_col)

Bokeh Charts¶

In [ ]:
source = ColumnDataSource(df_totals)
p = figure(title="Total UK baby names", x_axis_label="Year", y_axis_label="Babies",x_range=years, width=1100, height=500)
p.line(source=source, x="Years",y="Total")
show(p)
In [ ]:
# create a new plot with a title and axis labels
source = ColumnDataSource(df_family_counts)
p = figure(title="UK baby count for selected names", x_axis_label="", y_axis_label="Babies",x_range=years, width=1100, height=500)

# add a line renderer with legend and line thickness
p.line(x = "Year", y = "Sarah", legend_label="Sarah", line_color="red",line_width=2,source=source)
p.line(x = "Year", y = "Jason", legend_label="Jason", line_width=2,source=source)
p.line(x = "Year", y = "Leon", legend_label="Leon", line_color="yellow",line_width=2,source=source)
p.line(x = "Year", y = "Conrad", legend_label="Conrad", line_color="darkgrey",line_width=2,source=source)

# show the results
show(p)
In [ ]:
# create a new plot with a title and axis labels
source = ColumnDataSource(df_family_ranks)
p = figure(title="UK baby name rankings", x_axis_label="Year",y_range=(1500,1), y_axis_label="Name UK rank",x_range=years, width=1100, height=500)

# add a line renderer with legend and line thickness

p.line(x = "Year", y = "Sarah", legend_label="Sarah", line_color="red",line_width=2,source=source)
p.line(x = "Year", y = "Jason", legend_label="Jason", line_width=2,source=source)
p.line(x = "Year", y = "Leon", legend_label="Leon", line_color="yellow",line_width=2,source=source)
p.line(x = "Year", y = "Conrad", legend_label="Conrad", line_color="darkgrey",line_width=2,source=source)

# show the results
show(p)
In [ ]:
# create a new plot with a title and axis labels
source = ColumnDataSource(df_family_diff)
p = figure(title="Name Popularity % Change", x_axis_label="Year", y_axis_label="%",x_range=years, width=1100, height=500)

# add a line renderer with legend and line thickness

p.line(x = "Year", y = "Sarah %", legend_label="Sarah", line_color="red",line_width=2,source=source)
p.line(x = "Year", y = "Jason %", legend_label="Jason", line_width=2,source=source)
p.line(x = "Year", y = "Leon %", legend_label="Leon", line_color="yellow",line_width=2,source=source)
p.line(x = "Year", y = "Conrad %", legend_label="Conrad", line_color="darkgrey",line_width=2,source=source)

# show the results
show(p)

Matplotlib Charts¶

In [ ]:
df_totals.plot(title="Total UK baby names",figsize=(15,6),grid=True)
Out[ ]:
<AxesSubplot:title={'center':'Total UK baby names'}, xlabel='Years'>
In [ ]:
df_family_counts.plot(title="UK baby count for selected names",figsize=(15,6),grid=True)
Out[ ]:
<AxesSubplot:title={'center':'UK baby count for selected names'}, xlabel='Year'>
In [ ]:
df_family_ranks.plot(title="UK baby name rankings",figsize=(15,6),grid=True)
plt.ylim(1500,1)
Out[ ]:
(1500.0, 1.0)
In [ ]:
df_family_diff.plot(title="Name Popularity % Change",figsize=(15,6),grid=True)
Out[ ]:
<AxesSubplot:title={'center':'Name Popularity % Change'}, xlabel='Year'>

Seaborn Charts¶

In [ ]:
fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(df_totals).set(title="Total UK baby names")
plt.grid(True)
In [ ]:
fig, ax = plt.subplots(figsize=(15, 6))
sns.lineplot(df_family_counts).set(title="UK baby count for selected names")
plt.grid(True)
In [ ]:
fig, ax = plt.subplots(figsize=(15,6))
sns.lineplot(df_family_ranks).set(title="UK baby rank for selected names")
plt.ylim(1500,1)
plt.grid(True)
In [ ]:
fig, ax = plt.subplots(figsize=(15,6))
sns.lineplot(df_family_diff).set(title="Name Popularity % Change")
plt.grid(True)

Plotly Charts¶

In [ ]:
px.line(df_totals, title = "Total UK baby names")
In [ ]:
px.line(df_family_counts, title="UK baby count for selected names")
In [ ]:
px.line(df_family_ranks, title="UK baby rank for selected names")
fig.update_yaxes(autorange="reversed")
In [ ]:
px.line(df_family_diff, title="Name Popularity % Change")